TL;DR

Applying our zero-inflated model to the data from Zeisel et al. (2015). The data are from the mouse cortex and hippocampus. We have information about the tissue of origin and about the clusters found by the authors.

Unnormalized data

I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it should not work other than on Davide’s computer.

data <- read.table("expression_mRNA_17-Aug-2014.txt", sep='\t', stringsAsFactors = FALSE, comment.char = '%')

tissue <- as.factor(as.matrix(data)[1,-(1:2)])
table(tissue)
## tissue
## ca1hippocampus       sscortex 
##           1314           1691
group <- as.factor(as.matrix(data)[2,-(1:2)])
table(tissue, group)
##                 group
## tissue             1   2   3   4   5   6   7   8   9
##   ca1hippocampus 126  20 919 121  14  18  64  17  15
##   sscortex       164 370  29 699  84 157 134   9  45
nmolecule <- as.numeric(as.matrix(data)[3,-(1:2)])
sex <- as.factor(as.matrix(data)[5,-(1:2)])
table(tissue, sex)
##                 sex
## tissue            -1   0   1
##   ca1hippocampus 748  46 520
##   sscortex       776   0 915
level1 <- as.factor(as.matrix(data)[9,-(1:2)])
level2 <- as.factor(as.matrix(data)[10,-(1:2)])
table(level1)
## level1
## astrocytes_ependymal    endothelial-mural         interneurons 
##                  224                  235                  290 
##            microglia     oligodendrocytes        pyramidal CA1 
##                   98                  820                  939 
##         pyramidal SS 
##                  399
table(level1, level2)
##                       level2
## level1                 (none) Astro1 Astro2 CA1Pyr1 CA1Pyr2 CA1PyrInt
##   astrocytes_ependymal     65     68     61       0       0         0
##   endothelial-mural        15      0      0       0       0         0
##   interneurons              0      0      0       0       0         0
##   microglia                 0      0      0       0       0         0
##   oligodendrocytes          0      0      0       0       0         0
##   pyramidal CA1             0      0      0     380     447        49
##   pyramidal SS            109      0      0       0       0         0
##                       level2
## level1                 CA2Pyr2 Choroid ClauPyr Epend Int1 Int10 Int11
##   astrocytes_ependymal       0      10       0    20    0     0     0
##   endothelial-mural          0       0       0     0    0     0     0
##   interneurons               0       0       0     0   12    21    10
##   microglia                  0       0       0     0    0     0     0
##   oligodendrocytes           0       0       0     0    0     0     0
##   pyramidal CA1             41       0       0     0    0     0     0
##   pyramidal SS               0       0       5     0    0     0     0
##                       level2
## level1                 Int12 Int13 Int14 Int15 Int16 Int2 Int3 Int4 Int5
##   astrocytes_ependymal     0     0     0     0     0    0    0    0    0
##   endothelial-mural        0     0     0     0     0    0    0    0    0
##   interneurons            21    15    22    18    20   24   10   15   20
##   microglia                0     0     0     0     0    0    0    0    0
##   oligodendrocytes         0     0     0     0     0    0    0    0    0
##   pyramidal CA1            0     0     0     0     0    0    0    0    0
##   pyramidal SS             0     0     0     0     0    0    0    0    0
##                       level2
## level1                 Int6 Int7 Int8 Int9 Mgl1 Mgl2 Oligo1 Oligo2 Oligo3
##   astrocytes_ependymal    0    0    0    0    0    0      0      0      0
##   endothelial-mural       0    0    0    0    0    0      0      0      0
##   interneurons           22   23   26   11    0    0      0      0      0
##   microglia               0    0    0    0   17   16      0      0      0
##   oligodendrocytes        0    0    0    0    0    0     45     98     87
##   pyramidal CA1           0    0    0    0    0    0      0      0      0
##   pyramidal SS            0    0    0    0    0    0      0      0      0
##                       level2
## level1                 Oligo4 Oligo5 Oligo6 Peric Pvm1 Pvm2 S1PyrDL
##   astrocytes_ependymal      0      0      0     0    0    0       0
##   endothelial-mural         0      0      0    21    0    0       0
##   interneurons              0      0      0     0    0    0       0
##   microglia                 0      0      0     0   32   33       0
##   oligodendrocytes        106    125    359     0    0    0       0
##   pyramidal CA1             0      0      0     0    0    0       0
##   pyramidal SS              0      0      0     0    0    0      81
##                       level2
## level1                 S1PyrL23 S1PyrL4 S1PyrL5 S1PyrL5a S1PyrL6 S1PyrL6b
##   astrocytes_ependymal        0       0       0        0       0        0
##   endothelial-mural           0       0       0        0       0        0
##   interneurons                0       0       0        0       0        0
##   microglia                   0       0       0        0       0        0
##   oligodendrocytes            0       0       0        0       0        0
##   pyramidal CA1               0       0       0        0       0        0
##   pyramidal SS               74      26      16       28      39       21
##                       level2
## level1                 SubPyr Vend1 Vend2 Vsmc
##   astrocytes_ependymal      0     0     0    0
##   endothelial-mural         0    32   105   62
##   interneurons              0     0     0    0
##   microglia                 0     0     0    0
##   oligodendrocytes          0     0     0    0
##   pyramidal CA1            22     0     0    0
##   pyramidal SS              0     0     0    0
counts <- as.matrix(data[12:NROW(data),-(1:2)])
counts <- matrix(as.numeric(counts), ncol=ncol(counts), nrow=nrow(counts))
rownames(counts) <- data[12:NROW(data),1]
colnames(counts) <- data[8, -(1:2)]

To speed up the computations, I will focus on the top 1,000 most variable genes.

vars <- rowVars(log1p(counts))
names(vars) <- rownames(counts)
vars <- sort(vars, decreasing = TRUE)
core <- counts[names(vars)[1:1000],]

PCA

First, let’s look at PCA (of the log counts) for reference.

par(mfrow=c(2, 2))
detection_rate <- colSums(core>0)
coverage <- colSums(core)

col1 <- brewer.pal(9, "Set1")
col2 <- brewer.pal(8, "Set2")

colTissue <- col1[tissue]
colMerged <- col2[level1]

pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=20, main="PCA of log-counts, centered not scaled")
legend("topleft", levels(level1), fill=col2)
plot(pca$x, col=colTissue, pch=20, main="PCA of log-counts, centered not scaled")

fq <- EDASeq::betweenLaneNormalization(counts, which="full")

pcafq <- prcomp(t(log1p(fq[rownames(core),])))
plot(pcafq$x, col=colMerged, pch=20, main="PCA of FQ log-counts, centered not scaled")
plot(pcafq$x, col=colTissue, pch=20, main="PCA of FQ log-counts, centered not scaled")

The first plot is the unnormalized data and the second plot is after full-quantile normalization, which is what Russell and Diya used for the paper.

They found that to fully explain the differences between clusters, we need three dimensions.

pairs(pcafq$x[,1:3], col=colMerged, pch=20, main="PCA of FQ log-counts, centered not scaled")

df <- data.frame(PC1=pcafq$x[,1], PC2=pcafq$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=20, col=colMerged)

print(cor(df, method="spearman"))
##                       PC1       PC2 detection_rate   coverage
## PC1             1.0000000 0.1934101     -0.8208739 -0.6525198
## PC2             0.1934101 1.0000000      0.2749546  0.3998289
## detection_rate -0.8208739 0.2749546      1.0000000  0.8967131
## coverage       -0.6525198 0.3998289      0.8967131  1.0000000

Even after full-quantile normalization, there is some residual correlation between PC2 and detection rate.

ZINB

print(system.time(zinb_Vall <- zinbFit(core, ncores = ncores, K = 3)))
##     user   system  elapsed 
## 2485.394  537.528  421.206
pairs(zinb_Vall@W, col=colMerged, pch=20, main="ZINB")

pairs(zinb_Vall@W, col=colTissue, pch=20, main="ZINB")

df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], W3=zinb_Vall@W[,3], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=20, col=colMerged)

print(cor(df, method="spearman"))
##                          W1           W2          W3    gamma_mu
## W1              1.000000000  0.006476424  0.01376237  0.63967883
## W2              0.006476424  1.000000000  0.06323623 -0.06000655
## W3              0.013762371  0.063236226  1.00000000 -0.09798784
## gamma_mu        0.639678834 -0.060006550 -0.09798784  1.00000000
## gamma_pi       -0.709274239  0.131792434  0.04380961 -0.79807959
## detection_rate  0.770531522 -0.161658393 -0.07725217  0.90629270
## coverage        0.584275001 -0.151893464 -0.10906387  0.97166090
##                   gamma_pi detection_rate   coverage
## W1             -0.70927424     0.77053152  0.5842750
## W2              0.13179243    -0.16165839 -0.1518935
## W3              0.04380961    -0.07725217 -0.1090639
## gamma_mu       -0.79807959     0.90629270  0.9716609
## gamma_pi        1.00000000    -0.94897403 -0.8232345
## detection_rate -0.94897403     1.00000000  0.8967131
## coverage       -0.82323454     0.89671312  1.0000000

Four dimensions

print(system.time(zinb_4 <- zinbFit(core, ncores = ncores, K = 4)))
## Warning in simpute.als(x, J, thresh, lambda, maxit, trace.it, warm.start, :
## Convergence not achieved by 100 iterations
##     user   system  elapsed 
## 5497.948  850.268  792.701
pairs(zinb_4@W, col=colMerged, pch=20, main="ZINB by cluster (level 1)")

pairs(zinb_4@W, col=colTissue, pch=20, main="ZINB by tissue")

pairs(zinb_4@W, col=col2[group], pch=20, main="ZINB by group")

Looking at subsets of cells

col3 <- c(rgb(0, 0, 0, alpha = 0), read.table("colors.txt", stringsAsFactors = FALSE, comment.char = "%")[,1])
plot(zinb_Vall@W, col=col3[level2], pch=20, main="ZINB")

plot(pcafq$x, col=col3[level2], pch=20, main="PCA (FQ)")

i <- 7
idx <- which(level1==levels(level1)[i])
print(system.time(zinb_sub <- zinbFit(core[,idx], ncores = ncores, K = 2)))
##    user  system elapsed 
## 231.858 113.440  44.999
plot(zinb_sub@W, xlab="W1", ylab="W2", pch=19, col=col3[level2[idx]])
legend("bottomright", levels(droplevels(level2[idx])), fill=unique(col3[level2[idx]]))

i <- 6
idx <- which(level1==levels(level1)[i])
print(system.time(zinb_sub <- zinbFit(core[,idx], ncores = ncores, K = 2)))
##    user  system elapsed 
## 662.902 259.453 132.388
plot(zinb_sub@W, xlab="W1", ylab="W2", pch=19, col=col3[level2[idx]])
legend("bottomright", levels(droplevels(level2[idx])), fill=unique(col3[level2[idx]]))